import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv(r'C:\Users\Bharg\Data science udemy\Simpli learn classes\Walmart_Store_sales.csv')
df.head()
| Store | Date | Weekly_Sales | Holiday_Flag | Temperature | Fuel_Price | CPI | Unemployment | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 05-02-2010 | 1643690.90 | 0 | 42.31 | 2.572 | 211.096358 | 8.106 |
| 1 | 1 | 12-02-2010 | 1641957.44 | 1 | 38.51 | 2.548 | 211.242170 | 8.106 |
| 2 | 1 | 19-02-2010 | 1611968.17 | 0 | 39.93 | 2.514 | 211.289143 | 8.106 |
| 3 | 1 | 26-02-2010 | 1409727.59 | 0 | 46.63 | 2.561 | 211.319643 | 8.106 |
| 4 | 1 | 05-03-2010 | 1554806.68 | 0 | 46.50 | 2.625 | 211.350143 | 8.106 |
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 6435 entries, 0 to 6434 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Store 6435 non-null int64 1 Date 6435 non-null object 2 Weekly_Sales 6435 non-null float64 3 Holiday_Flag 6435 non-null int64 4 Temperature 6435 non-null float64 5 Fuel_Price 6435 non-null float64 6 CPI 6435 non-null float64 7 Unemployment 6435 non-null float64 dtypes: float64(5), int64(2), object(1) memory usage: 402.3+ KB
df.describe()
| Store | Weekly_Sales | Holiday_Flag | Temperature | Fuel_Price | CPI | Unemployment | |
|---|---|---|---|---|---|---|---|
| count | 6435.000000 | 6.435000e+03 | 6435.000000 | 6435.000000 | 6435.000000 | 6435.000000 | 6435.000000 |
| mean | 23.000000 | 1.046965e+06 | 0.069930 | 60.663782 | 3.358607 | 171.578394 | 7.999151 |
| std | 12.988182 | 5.643666e+05 | 0.255049 | 18.444933 | 0.459020 | 39.356712 | 1.875885 |
| min | 1.000000 | 2.099862e+05 | 0.000000 | -2.060000 | 2.472000 | 126.064000 | 3.879000 |
| 25% | 12.000000 | 5.533501e+05 | 0.000000 | 47.460000 | 2.933000 | 131.735000 | 6.891000 |
| 50% | 23.000000 | 9.607460e+05 | 0.000000 | 62.670000 | 3.445000 | 182.616521 | 7.874000 |
| 75% | 34.000000 | 1.420159e+06 | 0.000000 | 74.940000 | 3.735000 | 212.743293 | 8.622000 |
| max | 45.000000 | 3.818686e+06 | 1.000000 | 100.140000 | 4.468000 | 227.232807 | 14.313000 |
df.isna().sum()
Store 0 Date 0 Weekly_Sales 0 Holiday_Flag 0 Temperature 0 Fuel_Price 0 CPI 0 Unemployment 0 dtype: int64
df.iloc[0]['Date']
'05-02-2010'
type(df.iloc[0]['Date'])
str
df['Date'].dtype
dtype('O')
df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)
type(df.iloc[0]['Date'])
pandas._libs.tslibs.timestamps.Timestamp
df.head()
| Store | Date | Weekly_Sales | Holiday_Flag | Temperature | Fuel_Price | CPI | Unemployment | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2010-02-05 | 1643690.90 | 0 | 42.31 | 2.572 | 211.096358 | 8.106 |
| 1 | 1 | 2010-02-12 | 1641957.44 | 1 | 38.51 | 2.548 | 211.242170 | 8.106 |
| 2 | 1 | 2010-02-19 | 1611968.17 | 0 | 39.93 | 2.514 | 211.289143 | 8.106 |
| 3 | 1 | 2010-02-26 | 1409727.59 | 0 | 46.63 | 2.561 | 211.319643 | 8.106 |
| 4 | 1 | 2010-03-05 | 1554806.68 | 0 | 46.50 | 2.625 | 211.350143 | 8.106 |
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 6435 entries, 0 to 6434 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Store 6435 non-null int64 1 Date 6435 non-null datetime64[ns] 2 Weekly_Sales 6435 non-null float64 3 Holiday_Flag 6435 non-null int64 4 Temperature 6435 non-null float64 5 Fuel_Price 6435 non-null float64 6 CPI 6435 non-null float64 7 Unemployment 6435 non-null float64 dtypes: datetime64[ns](1), float64(5), int64(2) memory usage: 402.3 KB
df['Holiday_Flag'].unique()
array([0, 1], dtype=int64)
df['Store'].unique()
array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45], dtype=int64)
df[df.duplicated()]
| Store | Date | Weekly_Sales | Holiday_Flag | Temperature | Fuel_Price | CPI | Unemployment |
|---|
df.groupby('Store').sum()['Weekly_Sales'].nlargest(1).astype('int64')
Store 20 301397792 Name: Weekly_Sales, dtype: int64
df1 = pd.DataFrame(df.groupby('Store').sum()['Weekly_Sales'].astype('int64')).sort_values('Weekly_Sales', ascending=True)
plt.figure(figsize = (14,6), dpi=200)
cols = ['red' if (x == max(df1.Weekly_Sales)) else 'lightpink' for x in df1.Weekly_Sales]
sns.barplot(data= df1, x= df1.index, y='Weekly_Sales', order=df1.sort_values('Weekly_Sales').index, palette=cols);
df.describe()
| Store | Weekly_Sales | Holiday_Flag | Temperature | Fuel_Price | CPI | Unemployment | |
|---|---|---|---|---|---|---|---|
| count | 6435.000000 | 6.435000e+03 | 6435.000000 | 6435.000000 | 6435.000000 | 6435.000000 | 6435.000000 |
| mean | 23.000000 | 1.046965e+06 | 0.069930 | 60.663782 | 3.358607 | 171.578394 | 7.999151 |
| std | 12.988182 | 5.643666e+05 | 0.255049 | 18.444933 | 0.459020 | 39.356712 | 1.875885 |
| min | 1.000000 | 2.099862e+05 | 0.000000 | -2.060000 | 2.472000 | 126.064000 | 3.879000 |
| 25% | 12.000000 | 5.533501e+05 | 0.000000 | 47.460000 | 2.933000 | 131.735000 | 6.891000 |
| 50% | 23.000000 | 9.607460e+05 | 0.000000 | 62.670000 | 3.445000 | 182.616521 | 7.874000 |
| 75% | 34.000000 | 1.420159e+06 | 0.000000 | 74.940000 | 3.735000 | 212.743293 | 8.622000 |
| max | 45.000000 | 3.818686e+06 | 1.000000 | 100.140000 | 4.468000 | 227.232807 | 14.313000 |
df.groupby('Store').std()['Weekly_Sales'].nlargest(1)
Store 14 317569.949476 Name: Weekly_Sales, dtype: float64
df2 = pd.DataFrame(df.groupby('Store').std()['Weekly_Sales']).rename(columns={'Weekly_Sales': 'Weekly sales standard deviation'}).round(3)
df2.head()
| Weekly sales standard deviation | |
|---|---|
| Store | |
| 1 | 155980.768 |
| 2 | 237683.695 |
| 3 | 46319.632 |
| 4 | 266201.442 |
| 5 | 37737.966 |
df3 = pd.DataFrame(df.groupby('Store').mean()['Weekly_Sales']).rename(columns={'Weekly_Sales': 'Weekly sales mean'}).round(3)
df3.head()
| Weekly sales mean | |
|---|---|
| Store | |
| 1 | 1555264.398 |
| 2 | 1925751.336 |
| 3 | 402704.441 |
| 4 | 2094712.961 |
| 5 | 318011.810 |
df2['Weekly sales standard deviation']/df3['Weekly sales mean']
Store 1 0.100292 2 0.123424 3 0.115021 4 0.127083 5 0.118668 6 0.135823 7 0.197305 8 0.116953 9 0.126895 10 0.159133 11 0.122262 12 0.137925 13 0.132514 14 0.157137 15 0.193384 16 0.165181 17 0.125521 18 0.162845 19 0.132680 20 0.130903 21 0.170292 22 0.156783 23 0.179721 24 0.123637 25 0.159860 26 0.110111 27 0.135155 28 0.137330 29 0.183742 30 0.052008 31 0.090161 32 0.118310 33 0.092868 34 0.108225 35 0.229681 36 0.162579 37 0.042084 38 0.110875 39 0.149908 40 0.123430 41 0.148177 42 0.090335 43 0.064104 44 0.081793 45 0.165613 dtype: float64
df['year'] = df['Date'].dt.year
df['month'] = df['Date'].dt.month
df['Quarter'] = df['Date'].dt.quarter
df.head()
| Store | Date | Weekly_Sales | Holiday_Flag | Temperature | Fuel_Price | CPI | Unemployment | year | month | Quarter | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2010-02-05 | 1643690.90 | 0 | 42.31 | 2.572 | 211.096358 | 8.106 | 2010 | 2 | 1 |
| 1 | 1 | 2010-02-12 | 1641957.44 | 1 | 38.51 | 2.548 | 211.242170 | 8.106 | 2010 | 2 | 1 |
| 2 | 1 | 2010-02-19 | 1611968.17 | 0 | 39.93 | 2.514 | 211.289143 | 8.106 | 2010 | 2 | 1 |
| 3 | 1 | 2010-02-26 | 1409727.59 | 0 | 46.63 | 2.561 | 211.319643 | 8.106 | 2010 | 2 | 1 |
| 4 | 1 | 2010-03-05 | 1554806.68 | 0 | 46.50 | 2.625 | 211.350143 | 8.106 | 2010 | 3 | 1 |
df.tail()
| Store | Date | Weekly_Sales | Holiday_Flag | Temperature | Fuel_Price | CPI | Unemployment | year | month | Quarter | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 6430 | 45 | 2012-09-28 | 713173.95 | 0 | 64.88 | 3.997 | 192.013558 | 8.684 | 2012 | 9 | 3 |
| 6431 | 45 | 2012-10-05 | 733455.07 | 0 | 64.89 | 3.985 | 192.170412 | 8.667 | 2012 | 10 | 4 |
| 6432 | 45 | 2012-10-12 | 734464.36 | 0 | 54.47 | 4.000 | 192.327265 | 8.667 | 2012 | 10 | 4 |
| 6433 | 45 | 2012-10-19 | 718125.53 | 0 | 56.47 | 3.969 | 192.330854 | 8.667 | 2012 | 10 | 4 |
| 6434 | 45 | 2012-10-26 | 760281.43 | 0 | 58.85 | 3.882 | 192.308899 | 8.667 | 2012 | 10 | 4 |
#df4 = pd.DataFrame(df.groupby(['year', 'Quarter', 'Store']).sum()['Weekly_Sales']).astype('int64')
df4 = df[df['year'] == 2012]
df4
| Store | Date | Weekly_Sales | Holiday_Flag | Temperature | Fuel_Price | CPI | Unemployment | year | month | Quarter | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 100 | 1 | 2012-01-06 | 1550369.92 | 0 | 49.01 | 3.157 | 219.714258 | 7.348 | 2012 | 1 | 1 |
| 101 | 1 | 2012-01-13 | 1459601.17 | 0 | 48.53 | 3.261 | 219.892526 | 7.348 | 2012 | 1 | 1 |
| 102 | 1 | 2012-01-20 | 1394393.84 | 0 | 54.11 | 3.268 | 219.985689 | 7.348 | 2012 | 1 | 1 |
| 103 | 1 | 2012-01-27 | 1319325.59 | 0 | 54.26 | 3.290 | 220.078852 | 7.348 | 2012 | 1 | 1 |
| 104 | 1 | 2012-02-03 | 1636339.65 | 0 | 56.55 | 3.360 | 220.172015 | 7.348 | 2012 | 2 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6430 | 45 | 2012-09-28 | 713173.95 | 0 | 64.88 | 3.997 | 192.013558 | 8.684 | 2012 | 9 | 3 |
| 6431 | 45 | 2012-10-05 | 733455.07 | 0 | 64.89 | 3.985 | 192.170412 | 8.667 | 2012 | 10 | 4 |
| 6432 | 45 | 2012-10-12 | 734464.36 | 0 | 54.47 | 4.000 | 192.327265 | 8.667 | 2012 | 10 | 4 |
| 6433 | 45 | 2012-10-19 | 718125.53 | 0 | 56.47 | 3.969 | 192.330854 | 8.667 | 2012 | 10 | 4 |
| 6434 | 45 | 2012-10-26 | 760281.43 | 0 | 58.85 | 3.882 | 192.308899 | 8.667 | 2012 | 10 | 4 |
1935 rows × 11 columns
df4.year.unique()
array([2012], dtype=int64)
df5 = pd.DataFrame(df4.groupby(['Store', 'Quarter']).sum()['Weekly_Sales'])
df5.head(10)
| Weekly_Sales | ||
|---|---|---|
| Store | Quarter | |
| 1 | 1 | 20723762.83 |
| 2 | 20978760.12 | |
| 3 | 20253947.78 | |
| 4 | 6245587.29 | |
| 2 | 1 | 24528220.70 |
| 2 | 25083604.88 | |
| 3 | 24303354.86 | |
| 4 | 7581514.93 | |
| 3 | 1 | 5421809.72 |
| 2 | 5620316.49 |
df5.reset_index(inplace=True)
df_q2 = pd.DataFrame(df5[df5['Quarter'] == 2]['Weekly_Sales'])
df_q2.reset_index(inplace = True)
df_q2.drop('index', axis =1, inplace = True)
df_q2.head()
| Weekly_Sales | |
|---|---|
| 0 | 20978760.12 |
| 1 | 25083604.88 |
| 2 | 5620316.49 |
| 3 | 28454363.67 |
| 4 | 4466363.69 |
df_q3 = pd.DataFrame(df5[df5['Quarter'] == 3]['Weekly_Sales'])
df_q3.reset_index(inplace = True)
df_q3.drop('index', axis =1, inplace = True)
df_q3.head()
| Weekly_Sales | |
|---|---|
| 0 | 20253947.78 |
| 1 | 24303354.86 |
| 2 | 5298005.47 |
| 3 | 27796792.46 |
| 4 | 4163790.99 |
df_q2.rename(columns={'Weekly_Sales': 'Weekly sales_Q2'}, inplace = True)
df_q3.rename(columns={'Weekly_Sales': 'Weekly sales_Q3'}, inplace = True)
df_q3.head()
| Weekly sales_Q3 | |
|---|---|
| 0 | 20253947.78 |
| 1 | 24303354.86 |
| 2 | 5298005.47 |
| 3 | 27796792.46 |
| 4 | 4163790.99 |
result = pd.concat([df_q2, df_q3], axis=1)
result
result['percentage_growth'] = ((result['Weekly sales_Q3'] - result['Weekly sales_Q2'])/result['Weekly sales_Q3']*100)
result.reset_index(inplace = True)
result.head()
| index | Weekly sales_Q2 | Weekly sales_Q3 | percentage_growth | |
|---|---|---|---|---|
| 0 | 0 | 20978760.12 | 20253947.78 | -3.578623 |
| 1 | 1 | 25083604.88 | 24303354.86 | -3.210462 |
| 2 | 2 | 5620316.49 | 5298005.47 | -6.083629 |
| 3 | 3 | 28454363.67 | 27796792.46 | -2.365637 |
| 4 | 4 | 4466363.69 | 4163790.99 | -7.266760 |
result['Store'] = result['index'].apply(lambda x:x+1)
result.drop('index', axis =1, inplace = True)
result
| Weekly sales_Q2 | Weekly sales_Q3 | percentage_growth | Store | |
|---|---|---|---|---|
| 0 | 20978760.12 | 20253947.78 | -3.578623 | 1 |
| 1 | 25083604.88 | 24303354.86 | -3.210462 | 2 |
| 2 | 5620316.49 | 5298005.47 | -6.083629 | 3 |
| 3 | 28454363.67 | 27796792.46 | -2.365637 | 4 |
| 4 | 4466363.69 | 4163790.99 | -7.266760 | 5 |
| 5 | 20833909.92 | 20167312.24 | -3.305337 | 6 |
| 6 | 7290859.27 | 8262787.39 | 11.762715 | 7 |
| 7 | 11919630.95 | 11748952.70 | -1.452710 | 8 |
| 8 | 7484935.11 | 7022149.56 | -6.590369 | 9 |
| 9 | 23750369.17 | 23037258.76 | -3.095466 | 10 |
| 10 | 17787371.95 | 17516081.44 | -1.548808 | 11 |
| 11 | 13362388.58 | 12536324.37 | -6.589365 | 12 |
| 12 | 27009207.14 | 26421259.30 | -2.225283 | 13 |
| 13 | 25155535.41 | 21187560.65 | -18.727851 | 14 |
| 14 | 7955243.07 | 7612081.03 | -4.508124 | 15 |
| 15 | 6564335.98 | 7121541.64 | 7.824228 | 16 |
| 16 | 12592400.93 | 12459453.05 | -1.067044 | 17 |
| 17 | 13896194.65 | 13489765.27 | -3.012872 | 18 |
| 18 | 18367300.24 | 18203554.85 | -0.899524 | 19 |
| 19 | 27524197.32 | 26891526.98 | -2.352675 | 20 |
| 20 | 9294596.35 | 9027599.32 | -2.957564 | 21 |
| 21 | 13487894.06 | 12845139.71 | -5.003872 | 22 |
| 22 | 18488882.82 | 18641489.15 | 0.818638 | 23 |
| 23 | 17684218.91 | 17976377.72 | 1.625237 | 24 |
| 24 | 9323012.09 | 9109081.84 | -2.348538 | 25 |
| 25 | 13155335.57 | 13675691.91 | 3.804973 | 26 |
| 26 | 22744012.75 | 22307711.41 | -1.955832 | 27 |
| 27 | 16506893.13 | 16080704.97 | -2.650308 | 28 |
| 28 | 7125307.50 | 6671234.14 | -6.806437 | 29 |
| 29 | 5742314.29 | 5594701.86 | -2.638432 | 30 |
| 30 | 18267238.50 | 17806714.45 | -2.586238 | 31 |
| 31 | 15489271.05 | 15396528.95 | -0.602357 | 32 |
| 32 | 3549000.39 | 3433620.36 | -3.360302 | 33 |
| 33 | 12853618.02 | 12485995.94 | -2.944275 | 34 |
| 34 | 10838313.00 | 11322421.12 | 4.275659 | 35 |
| 35 | 4151991.58 | 3831691.64 | -8.359231 | 36 |
| 36 | 6824549.37 | 6728068.24 | -1.434009 | 37 |
| 37 | 5637918.82 | 5605482.38 | -0.578656 | 38 |
| 38 | 20214128.46 | 20715116.23 | 2.418465 | 39 |
| 39 | 12727737.53 | 12873195.37 | 1.129928 | 40 |
| 40 | 17659942.73 | 18093844.01 | 2.398060 | 41 |
| 41 | 7568239.27 | 7296759.34 | -3.720555 | 42 |
| 42 | 8168836.35 | 8000572.16 | -2.103152 | 43 |
| 43 | 4306405.78 | 4411251.16 | 2.376772 | 44 |
| 44 | 10390767.83 | 9581268.38 | -8.448771 | 45 |
result = result.set_index('Store')
result
| Weekly sales_Q2 | Weekly sales_Q3 | percentage_growth | |
|---|---|---|---|
| Store | |||
| 1 | 20978760.12 | 20253947.78 | -3.578623 |
| 2 | 25083604.88 | 24303354.86 | -3.210462 |
| 3 | 5620316.49 | 5298005.47 | -6.083629 |
| 4 | 28454363.67 | 27796792.46 | -2.365637 |
| 5 | 4466363.69 | 4163790.99 | -7.266760 |
| 6 | 20833909.92 | 20167312.24 | -3.305337 |
| 7 | 7290859.27 | 8262787.39 | 11.762715 |
| 8 | 11919630.95 | 11748952.70 | -1.452710 |
| 9 | 7484935.11 | 7022149.56 | -6.590369 |
| 10 | 23750369.17 | 23037258.76 | -3.095466 |
| 11 | 17787371.95 | 17516081.44 | -1.548808 |
| 12 | 13362388.58 | 12536324.37 | -6.589365 |
| 13 | 27009207.14 | 26421259.30 | -2.225283 |
| 14 | 25155535.41 | 21187560.65 | -18.727851 |
| 15 | 7955243.07 | 7612081.03 | -4.508124 |
| 16 | 6564335.98 | 7121541.64 | 7.824228 |
| 17 | 12592400.93 | 12459453.05 | -1.067044 |
| 18 | 13896194.65 | 13489765.27 | -3.012872 |
| 19 | 18367300.24 | 18203554.85 | -0.899524 |
| 20 | 27524197.32 | 26891526.98 | -2.352675 |
| 21 | 9294596.35 | 9027599.32 | -2.957564 |
| 22 | 13487894.06 | 12845139.71 | -5.003872 |
| 23 | 18488882.82 | 18641489.15 | 0.818638 |
| 24 | 17684218.91 | 17976377.72 | 1.625237 |
| 25 | 9323012.09 | 9109081.84 | -2.348538 |
| 26 | 13155335.57 | 13675691.91 | 3.804973 |
| 27 | 22744012.75 | 22307711.41 | -1.955832 |
| 28 | 16506893.13 | 16080704.97 | -2.650308 |
| 29 | 7125307.50 | 6671234.14 | -6.806437 |
| 30 | 5742314.29 | 5594701.86 | -2.638432 |
| 31 | 18267238.50 | 17806714.45 | -2.586238 |
| 32 | 15489271.05 | 15396528.95 | -0.602357 |
| 33 | 3549000.39 | 3433620.36 | -3.360302 |
| 34 | 12853618.02 | 12485995.94 | -2.944275 |
| 35 | 10838313.00 | 11322421.12 | 4.275659 |
| 36 | 4151991.58 | 3831691.64 | -8.359231 |
| 37 | 6824549.37 | 6728068.24 | -1.434009 |
| 38 | 5637918.82 | 5605482.38 | -0.578656 |
| 39 | 20214128.46 | 20715116.23 | 2.418465 |
| 40 | 12727737.53 | 12873195.37 | 1.129928 |
| 41 | 17659942.73 | 18093844.01 | 2.398060 |
| 42 | 7568239.27 | 7296759.34 | -3.720555 |
| 43 | 8168836.35 | 8000572.16 | -2.103152 |
| 44 | 4306405.78 | 4411251.16 | 2.376772 |
| 45 | 10390767.83 | 9581268.38 | -8.448771 |
Positive_growth = result[result['percentage_growth']>0].sort_values('percentage_growth')
Positive_growth
| Weekly sales_Q2 | Weekly sales_Q3 | percentage_growth | |
|---|---|---|---|
| Store | |||
| 23 | 18488882.82 | 18641489.15 | 0.818638 |
| 40 | 12727737.53 | 12873195.37 | 1.129928 |
| 24 | 17684218.91 | 17976377.72 | 1.625237 |
| 44 | 4306405.78 | 4411251.16 | 2.376772 |
| 41 | 17659942.73 | 18093844.01 | 2.398060 |
| 39 | 20214128.46 | 20715116.23 | 2.418465 |
| 26 | 13155335.57 | 13675691.91 | 3.804973 |
| 35 | 10838313.00 | 11322421.12 | 4.275659 |
| 16 | 6564335.98 | 7121541.64 | 7.824228 |
| 7 | 7290859.27 | 8262787.39 | 11.762715 |
plt.figure(figsize = (14,6), dpi=200)
cols = ['green' if (x == max(Positive_growth.percentage_growth)) else 'lightgreen' for x in Positive_growth.percentage_growth]
sns.barplot(data= Positive_growth, x = Positive_growth.index, y='percentage_growth', order=Positive_growth.sort_values('percentage_growth').index, palette=cols);
df.head()
| Store | Date | Weekly_Sales | Holiday_Flag | Temperature | Fuel_Price | CPI | Unemployment | year | month | Quarter | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2010-02-05 | 1643690.90 | 0 | 42.31 | 2.572 | 211.096358 | 8.106 | 2010 | 2 | 1 |
| 1 | 1 | 2010-02-12 | 1641957.44 | 1 | 38.51 | 2.548 | 211.242170 | 8.106 | 2010 | 2 | 1 |
| 2 | 1 | 2010-02-19 | 1611968.17 | 0 | 39.93 | 2.514 | 211.289143 | 8.106 | 2010 | 2 | 1 |
| 3 | 1 | 2010-02-26 | 1409727.59 | 0 | 46.63 | 2.561 | 211.319643 | 8.106 | 2010 | 2 | 1 |
| 4 | 1 | 2010-03-05 | 1554806.68 | 0 | 46.50 | 2.625 | 211.350143 | 8.106 | 2010 | 3 | 1 |
Super Bowl: 12-Feb-10, 11-Feb-11, 10-Feb-12, 8-Feb-13 Labour Day: 10-Sep-10, 9-Sep-11, 7-Sep-12, 6-Sep-13 Thanksgiving: 26-Nov-10, 25-Nov-11, 23-Nov-12, 29-Nov-13 Christmas: 31-Dec-10, 30-Dec-11, 28-Dec-12, 27-Dec-13
Holidays_data = pd.DataFrame(df[df['Holiday_Flag'] == 1])
Holidays_data.head()
| Store | Date | Weekly_Sales | Holiday_Flag | Temperature | Fuel_Price | CPI | Unemployment | year | month | Quarter | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1 | 2010-02-12 | 1641957.44 | 1 | 38.51 | 2.548 | 211.242170 | 8.106 | 2010 | 2 | 1 |
| 31 | 1 | 2010-09-10 | 1507460.69 | 1 | 78.69 | 2.565 | 211.495190 | 7.787 | 2010 | 9 | 3 |
| 42 | 1 | 2010-11-26 | 1955624.11 | 1 | 64.52 | 2.735 | 211.748433 | 7.838 | 2010 | 11 | 4 |
| 47 | 1 | 2010-12-31 | 1367320.01 | 1 | 48.43 | 2.943 | 211.404932 | 7.838 | 2010 | 12 | 4 |
| 53 | 1 | 2011-02-11 | 1649614.93 | 1 | 36.39 | 3.022 | 212.936705 | 7.742 | 2011 | 2 | 1 |
Holidays_data.month.nunique()
4
Holidays_data.Holiday_Flag.value_counts()
1 450 Name: Holiday_Flag, dtype: int64
Holidays_mean_sales = pd.DataFrame(Holidays_data.groupby('month').mean('Weekly_Sales').round(2))
Holidays_mean_sales
| Store | Weekly_Sales | Holiday_Flag | Temperature | Fuel_Price | CPI | Unemployment | year | Quarter | |
|---|---|---|---|---|---|---|---|---|---|
| month | |||||||||
| 2 | 23.0 | 1079127.99 | 1.0 | 35.39 | 3.12 | 170.63 | 8.14 | 2011.0 | 1.0 |
| 9 | 23.0 | 1042427.29 | 1.0 | 74.65 | 3.42 | 172.11 | 7.92 | 2011.0 | 3.0 |
| 11 | 23.0 | 1471273.43 | 1.0 | 48.76 | 3.18 | 170.98 | 8.14 | 2010.5 | 4.0 |
| 12 | 23.0 | 960833.11 | 1.0 | 37.35 | 3.16 | 171.26 | 8.14 | 2010.5 | 4.0 |
df_q3.rename(columns={'Weekly_Sales': 'Weekly sales_Q3'}, inplace = True)
Holidays_mean_sales.reset_index(inplace = True)
Holidays_mean_sales
| month | Store | Weekly_Sales | Holiday_Flag | Temperature | Fuel_Price | CPI | Unemployment | year | Quarter | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2 | 23.0 | 1079127.99 | 1.0 | 35.39 | 3.12 | 170.63 | 8.14 | 2011.0 | 1.0 |
| 1 | 9 | 23.0 | 1042427.29 | 1.0 | 74.65 | 3.42 | 172.11 | 7.92 | 2011.0 | 3.0 |
| 2 | 11 | 23.0 | 1471273.43 | 1.0 | 48.76 | 3.18 | 170.98 | 8.14 | 2010.5 | 4.0 |
| 3 | 12 | 23.0 | 960833.11 | 1.0 | 37.35 | 3.16 | 171.26 | 8.14 | 2010.5 | 4.0 |
Holidays_mean_sales['month'] = Holidays_mean_sales['month'].map({2: 'Super Bowl', 9: 'Labour Day', 11: 'Thanksgiving', 12: 'Christmas'})
df6 = Holidays_mean_sales[['month', 'Weekly_Sales']]
df.head()
| Store | Date | Weekly_Sales | Holiday_Flag | Temperature | Fuel_Price | CPI | Unemployment | year | month | Quarter | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2010-02-05 | 1643690.90 | 0 | 42.31 | 2.572 | 211.096358 | 8.106 | 2010 | 2 | 1 |
| 1 | 1 | 2010-02-12 | 1641957.44 | 1 | 38.51 | 2.548 | 211.242170 | 8.106 | 2010 | 2 | 1 |
| 2 | 1 | 2010-02-19 | 1611968.17 | 0 | 39.93 | 2.514 | 211.289143 | 8.106 | 2010 | 2 | 1 |
| 3 | 1 | 2010-02-26 | 1409727.59 | 0 | 46.63 | 2.561 | 211.319643 | 8.106 | 2010 | 2 | 1 |
| 4 | 1 | 2010-03-05 | 1554806.68 | 0 | 46.50 | 2.625 | 211.350143 | 8.106 | 2010 | 3 | 1 |
Non_Holiday_mean_sales = df[df['Holiday_Flag'] ==0]['Weekly_Sales'].mean()
Non_Holiday_mean_sales
1041256.3802088564
df6.loc[len(df.index)] = ['Non Holidays mean sales', 1041256.38 ]
df6.set_index('month')
C:\Users\Bharg\AppData\Local\Temp\ipykernel_8820\4229879942.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df6.loc[len(df.index)] = ['Non Holidays mean sales', 1041256.38 ]
| Weekly_Sales | |
|---|---|
| month | |
| Super Bowl | 1079127.99 |
| Labour Day | 1042427.29 |
| Thanksgiving | 1471273.43 |
| Christmas | 960833.11 |
| Non Holidays mean sales | 1041256.38 |
df6.sort_values(by = 'Weekly_Sales', ascending=False)
| month | Weekly_Sales | |
|---|---|---|
| 2 | Thanksgiving | 1471273.43 |
| 0 | Super Bowl | 1079127.99 |
| 1 | Labour Day | 1042427.29 |
| 6435 | Non Holidays mean sales | 1041256.38 |
| 3 | Christmas | 960833.11 |
df.head()
| Store | Date | Weekly_Sales | Holiday_Flag | Temperature | Fuel_Price | CPI | Unemployment | year | month | Quarter | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2010-02-05 | 1643690.90 | 0 | 42.31 | 2.572 | 211.096358 | 8.106 | 2010 | 2 | 1 |
| 1 | 1 | 2010-02-12 | 1641957.44 | 1 | 38.51 | 2.548 | 211.242170 | 8.106 | 2010 | 2 | 1 |
| 2 | 1 | 2010-02-19 | 1611968.17 | 0 | 39.93 | 2.514 | 211.289143 | 8.106 | 2010 | 2 | 1 |
| 3 | 1 | 2010-02-26 | 1409727.59 | 0 | 46.63 | 2.561 | 211.319643 | 8.106 | 2010 | 2 | 1 |
| 4 | 1 | 2010-03-05 | 1554806.68 | 0 | 46.50 | 2.625 | 211.350143 | 8.106 | 2010 | 3 | 1 |
months = np.arange(1, 13).tolist()
months
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
df[(df['year'] == 2010) & (df['month'].isin(months))]
| Store | Date | Weekly_Sales | Holiday_Flag | Temperature | Fuel_Price | CPI | Unemployment | year | month | Quarter | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2010-02-05 | 1643690.90 | 0 | 42.31 | 2.572 | 211.096358 | 8.106 | 2010 | 2 | 1 |
| 1 | 1 | 2010-02-12 | 1641957.44 | 1 | 38.51 | 2.548 | 211.242170 | 8.106 | 2010 | 2 | 1 |
| 2 | 1 | 2010-02-19 | 1611968.17 | 0 | 39.93 | 2.514 | 211.289143 | 8.106 | 2010 | 2 | 1 |
| 3 | 1 | 2010-02-26 | 1409727.59 | 0 | 46.63 | 2.561 | 211.319643 | 8.106 | 2010 | 2 | 1 |
| 4 | 1 | 2010-03-05 | 1554806.68 | 0 | 46.50 | 2.625 | 211.350143 | 8.106 | 2010 | 3 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6335 | 45 | 2010-12-03 | 879244.90 | 0 | 40.93 | 3.046 | 182.667615 | 8.724 | 2010 | 12 | 4 |
| 6336 | 45 | 2010-12-10 | 1002364.34 | 0 | 30.54 | 3.109 | 182.551954 | 8.724 | 2010 | 12 | 4 |
| 6337 | 45 | 2010-12-17 | 1123282.85 | 0 | 30.51 | 3.140 | 182.517732 | 8.724 | 2010 | 12 | 4 |
| 6338 | 45 | 2010-12-24 | 1682862.03 | 0 | 30.59 | 3.141 | 182.544590 | 8.724 | 2010 | 12 | 4 |
| 6339 | 45 | 2010-12-31 | 679156.20 | 1 | 29.67 | 3.179 | 182.571448 | 8.724 | 2010 | 12 | 4 |
2160 rows × 11 columns
df.groupby(['year', 'month'])['Weekly_Sales'].sum().astype('int64')
year month
2010 2 190332983
3 181919802
4 231412368
5 186710934
6 192246172
7 232580125
8 187640110
9 177267896
10 217161824
11 202853370
12 288760532
2011 1 163703966
2 186331327
3 179356448
4 226526510
5 181648158
6 189773385
7 229911398
8 188599332
9 220847738
10 183261283
11 210162354
12 288078102
2012 1 168894471
2 192063579
3 231509650
4 188920905
5 188766479
6 240610329
7 187509452
8 236850765
9 180645544
10 184361680
Name: Weekly_Sales, dtype: int64
plt.figure(figsize=(14,6), dpi=80)
df.groupby(['year', 'month'])['Weekly_Sales'].sum().plot(kind='bar', legend=False, color = 'lightgreen', logy = True, ylabel = 'Sales')
plt.title('monthly Sales');
df.groupby(['year', 'Quarter'])['Weekly_Sales'].sum().astype('int64')
year Quarter
2010 1 372252785
2 610369474
3 597488133
4 708775726
2011 1 529391742
2 597948054
3 639358469
4 681501740
2012 1 592467701
2 618297714
3 605005762
4 184361680
Name: Weekly_Sales, dtype: int64
plt.figure(figsize=(14,6), dpi=80)
df.groupby(['year', 'Quarter'])['Weekly_Sales'].sum().plot(kind='bar', color= 'green', legend=False, logy = True, ylabel = 'Sales')
plt.title('Quarterly Sales');
df.head()
| Store | Date | Weekly_Sales | Holiday_Flag | Temperature | Fuel_Price | CPI | Unemployment | year | month | Quarter | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2010-02-05 | 1643690.90 | 0 | 42.31 | 2.572 | 211.096358 | 8.106 | 2010 | 2 | 1 |
| 1 | 1 | 2010-02-12 | 1641957.44 | 1 | 38.51 | 2.548 | 211.242170 | 8.106 | 2010 | 2 | 1 |
| 2 | 1 | 2010-02-19 | 1611968.17 | 0 | 39.93 | 2.514 | 211.289143 | 8.106 | 2010 | 2 | 1 |
| 3 | 1 | 2010-02-26 | 1409727.59 | 0 | 46.63 | 2.561 | 211.319643 | 8.106 | 2010 | 2 | 1 |
| 4 | 1 | 2010-03-05 | 1554806.68 | 0 | 46.50 | 2.625 | 211.350143 | 8.106 | 2010 | 3 | 1 |
df.head()
| Store | Date | Weekly_Sales | Holiday_Flag | Temperature | Fuel_Price | CPI | Unemployment | year | month | Quarter | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2010-02-05 | 1643690.90 | 0 | 42.31 | 2.572 | 211.096358 | 8.106 | 2010 | 2 | 1 |
| 1 | 1 | 2010-02-12 | 1641957.44 | 1 | 38.51 | 2.548 | 211.242170 | 8.106 | 2010 | 2 | 1 |
| 2 | 1 | 2010-02-19 | 1611968.17 | 0 | 39.93 | 2.514 | 211.289143 | 8.106 | 2010 | 2 | 1 |
| 3 | 1 | 2010-02-26 | 1409727.59 | 0 | 46.63 | 2.561 | 211.319643 | 8.106 | 2010 | 2 | 1 |
| 4 | 1 | 2010-03-05 | 1554806.68 | 0 | 46.50 | 2.625 | 211.350143 | 8.106 | 2010 | 3 | 1 |
df.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Store | 6435.0 | 2.300000e+01 | 12.988182 | 1.000 | 12.000 | 23.000000 | 3.400000e+01 | 4.500000e+01 |
| Weekly_Sales | 6435.0 | 1.046965e+06 | 564366.622054 | 209986.250 | 553350.105 | 960746.040000 | 1.420159e+06 | 3.818686e+06 |
| Holiday_Flag | 6435.0 | 6.993007e-02 | 0.255049 | 0.000 | 0.000 | 0.000000 | 0.000000e+00 | 1.000000e+00 |
| Temperature | 6435.0 | 6.066378e+01 | 18.444933 | -2.060 | 47.460 | 62.670000 | 7.494000e+01 | 1.001400e+02 |
| Fuel_Price | 6435.0 | 3.358607e+00 | 0.459020 | 2.472 | 2.933 | 3.445000 | 3.735000e+00 | 4.468000e+00 |
| CPI | 6435.0 | 1.715784e+02 | 39.356712 | 126.064 | 131.735 | 182.616521 | 2.127433e+02 | 2.272328e+02 |
| Unemployment | 6435.0 | 7.999151e+00 | 1.875885 | 3.879 | 6.891 | 7.874000 | 8.622000e+00 | 1.431300e+01 |
| year | 6435.0 | 2.010965e+03 | 0.797019 | 2010.000 | 2010.000 | 2011.000000 | 2.012000e+03 | 2.012000e+03 |
| month | 6435.0 | 6.447552e+00 | 3.238308 | 1.000 | 4.000 | 6.000000 | 9.000000e+00 | 1.200000e+01 |
| Quarter | 6435.0 | 2.482517e+00 | 1.070036 | 1.000 | 2.000 | 2.000000 | 3.000000e+00 | 4.000000e+00 |
df.corr()['Weekly_Sales'].sort_values()
Store -0.335332 Unemployment -0.106176 CPI -0.072634 Temperature -0.063810 year -0.018378 Fuel_Price 0.009464 Holiday_Flag 0.036891 Quarter 0.063363 month 0.076143 Weekly_Sales 1.000000 Name: Weekly_Sales, dtype: float64
plt.figure(figsize=(12,8),dpi =200)
sns.scatterplot(data = df, x='month', y ='Weekly_Sales', hue = 'Holiday_Flag',alpha = 0.4);
plt.figure(figsize=(12,8),dpi =200)
sns.scatterplot(data = df, x='Quarter', y ='Weekly_Sales', hue = 'Holiday_Flag', alpha = 0.5, palette = 'deep');
plt.figure(figsize=(12,8),dpi =200)
sns.scatterplot(data = df, x='Store', y ='Weekly_Sales', hue = 'Holiday_Flag', alpha = 0.4);
stores = np.arange(1, 21).tolist()
stores
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
df[(df['Weekly_Sales'] >= 3500000) & (df['Store'].isin(stores))]
| Store | Date | Weekly_Sales | Holiday_Flag | Temperature | Fuel_Price | CPI | Unemployment | year | month | Quarter | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 475 | 4 | 2010-12-24 | 3526713.39 | 0 | 43.21 | 2.887 | 126.983581 | 7.127 | 2010 | 12 | 4 |
| 527 | 4 | 2011-12-23 | 3676388.98 | 0 | 35.92 | 3.103 | 129.984548 | 5.143 | 2011 | 12 | 4 |
| 1333 | 10 | 2010-12-24 | 3749057.69 | 0 | 57.06 | 3.236 | 126.983581 | 9.003 | 2010 | 12 | 4 |
| 1762 | 13 | 2010-12-24 | 3595903.20 | 0 | 34.90 | 2.846 | 126.983581 | 7.795 | 2010 | 12 | 4 |
| 1814 | 13 | 2011-12-23 | 3556766.03 | 0 | 24.76 | 3.186 | 129.984548 | 6.392 | 2011 | 12 | 4 |
| 1905 | 14 | 2010-12-24 | 3818686.45 | 0 | 30.59 | 3.141 | 182.544590 | 8.724 | 2010 | 12 | 4 |
| 2763 | 20 | 2010-12-24 | 3766687.43 | 0 | 25.17 | 3.141 | 204.637673 | 7.484 | 2010 | 12 | 4 |
| 2815 | 20 | 2011-12-23 | 3555371.03 | 0 | 40.19 | 3.389 | 212.236040 | 7.082 | 2011 | 12 | 4 |
plt.figure(figsize=(12,8),dpi =200)
sns.scatterplot(data = df, x='Unemployment', y ='Weekly_Sales', hue = 'Holiday_Flag',alpha = 0.4);
df[(df['Weekly_Sales'] >= 3500000) & ((df['Unemployment'] > 6) & (df['Unemployment'] < 10))]
| Store | Date | Weekly_Sales | Holiday_Flag | Temperature | Fuel_Price | CPI | Unemployment | year | month | Quarter | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 475 | 4 | 2010-12-24 | 3526713.39 | 0 | 43.21 | 2.887 | 126.983581 | 7.127 | 2010 | 12 | 4 |
| 1333 | 10 | 2010-12-24 | 3749057.69 | 0 | 57.06 | 3.236 | 126.983581 | 9.003 | 2010 | 12 | 4 |
| 1762 | 13 | 2010-12-24 | 3595903.20 | 0 | 34.90 | 2.846 | 126.983581 | 7.795 | 2010 | 12 | 4 |
| 1814 | 13 | 2011-12-23 | 3556766.03 | 0 | 24.76 | 3.186 | 129.984548 | 6.392 | 2011 | 12 | 4 |
| 1905 | 14 | 2010-12-24 | 3818686.45 | 0 | 30.59 | 3.141 | 182.544590 | 8.724 | 2010 | 12 | 4 |
| 2763 | 20 | 2010-12-24 | 3766687.43 | 0 | 25.17 | 3.141 | 204.637673 | 7.484 | 2010 | 12 | 4 |
| 2815 | 20 | 2011-12-23 | 3555371.03 | 0 | 40.19 | 3.389 | 212.236040 | 7.082 | 2011 | 12 | 4 |
plt.figure(figsize=(12,8),dpi =200)
sns.scatterplot(data = df, x='CPI', y ='Weekly_Sales', hue = 'Holiday_Flag',alpha = 0.4);
df[(df['Weekly_Sales'] >= 3450000) & ((df['CPI'] < 150.0) | (df['CPI'] >180.0))]
| Store | Date | Weekly_Sales | Holiday_Flag | Temperature | Fuel_Price | CPI | Unemployment | year | month | Quarter | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 475 | 4 | 2010-12-24 | 3526713.39 | 0 | 43.21 | 2.887 | 126.983581 | 7.127 | 2010 | 12 | 4 |
| 527 | 4 | 2011-12-23 | 3676388.98 | 0 | 35.92 | 3.103 | 129.984548 | 5.143 | 2011 | 12 | 4 |
| 1333 | 10 | 2010-12-24 | 3749057.69 | 0 | 57.06 | 3.236 | 126.983581 | 9.003 | 2010 | 12 | 4 |
| 1385 | 10 | 2011-12-23 | 3487986.89 | 0 | 48.36 | 3.541 | 129.984548 | 7.874 | 2011 | 12 | 4 |
| 1762 | 13 | 2010-12-24 | 3595903.20 | 0 | 34.90 | 2.846 | 126.983581 | 7.795 | 2010 | 12 | 4 |
| 1814 | 13 | 2011-12-23 | 3556766.03 | 0 | 24.76 | 3.186 | 129.984548 | 6.392 | 2011 | 12 | 4 |
| 1905 | 14 | 2010-12-24 | 3818686.45 | 0 | 30.59 | 3.141 | 182.544590 | 8.724 | 2010 | 12 | 4 |
| 2763 | 20 | 2010-12-24 | 3766687.43 | 0 | 25.17 | 3.141 | 204.637673 | 7.484 | 2010 | 12 | 4 |
| 2815 | 20 | 2011-12-23 | 3555371.03 | 0 | 40.19 | 3.389 | 212.236040 | 7.082 | 2011 | 12 | 4 |
df.columns
Index(['Store', 'Date', 'Weekly_Sales', 'Holiday_Flag', 'Temperature',
'Fuel_Price', 'CPI', 'Unemployment', 'year', 'month', 'Quarter'],
dtype='object')
plt.figure(figsize=(14,6),dpi=150)
sns.pairplot(df, x_vars=["Holiday_Flag", "Temperature", 'Fuel_Price', 'CPI', 'Unemployment', 'month', 'Quarter'],
y_vars=['Weekly_Sales']);
<Figure size 2100x900 with 0 Axes>